# Dylan Carlson Sirvent León (dylancarlson@g.harvard.edu)
# Code defines function bind_eia It is binds all eia xlsx files from the specified "Planned" worksheet.

library(tidyverse)
library(tidylog)
library(here)
library(janitor)
library(modelsummary)
library(purrr)
library(glue)

# Disable all progress indicators and verbose output to prevent log clutter
options(
  readxl.progress = FALSE,
  curl.progress = FALSE,
  httr_progress = FALSE,
  dplyr.show_progress = FALSE
)

source(here("analysis", "visibility", "processing", "process_eia", "read_eia.R"))

# Validation Functions ----

# Validate raw input data
validate_input_data <- function(df) {
  message("=== Input Data Validation ===")
  
  # Check data dimensions
  message(glue("Raw data: {nrow(df)} rows, {ncol(df)} columns"))
  
  # Check for completely empty dataset
  if (nrow(df) == 0) stop("ERROR: No data loaded from EIA files")
  
  # Check required columns exist
  required_cols <- c("plant_id", "generator_id", "net_summer_capacity_mw", "latitude", "longitude", "technology", "status")
  missing_cols <- setdiff(required_cols, names(df))
  if (length(missing_cols) > 0) {
    stop(glue("ERROR: Missing required columns: {paste(missing_cols, collapse=', ')}"))
  }
  
  # Check data types and ranges
  message(glue("Unique ids: {length(unique(df$plant_id))}"))
  message(glue("Technologies: {length(unique(df$technology))}"))
  message(glue("Status categories: {length(unique(df$status))}"))
  
  # Check for suspicious values
  if (any(df$latitude > 90 | df$latitude < -90, na.rm = TRUE)) {
    message("Invalid latitude values detected")
  }
  if (any(df$longitude > 180 | df$longitude < -180, na.rm = TRUE)) {
    message("Invalid longitude values detected")
  }
  if (any(df$net_summer_capacity_mw < 0, na.rm = TRUE)) {
    message("Negative capacity values detected")
  }
  
  return(df)
}

# Validate after status categorization
validate_status_processing <- function(df_before, df_after) {
  message("=== Status Processing Validation ===")
  
  rows_before <- nrow(df_before)
  rows_after <- nrow(df_after)
  rows_dropped <- rows_before - rows_after
  
  message(glue("Rows before status filter: {rows_before}"))
  message(glue("Rows after status filter: {rows_after}"))
  message(glue("Rows dropped (Other status): {rows_dropped} ({round(100*rows_dropped/rows_before, 1)}%)"))
  
  # Check if we lost too much data
  if (rows_dropped / rows_before > 0.5) {
    warning("WARNING: Dropped more than 50% of data in status filtering")
  }
  
  # Validate status categories
  valid_statuses <- c("pre-construction", "construction")
  invalid_statuses <- setdiff(unique(df_after$status), valid_statuses)
  if (length(invalid_statuses) > 0) {
    stop(glue("ERROR: Invalid status categories remain: {paste(invalid_statuses, collapse=', ')}"))
  }
  
  return(df_after)
}

# Validate geographic data
validate_geographic_data <- function(df) {
  message("=== Geographic Data Validation ===")
  
  missing_coords <- sum(is.na(df$latitude) | is.na(df$longitude))
  total_rows <- nrow(df)
  
  message(glue("Missing coordinates: {missing_coords}/{total_rows} ({round(100*missing_coords/total_rows, 1)}%)"))
  
  if (missing_coords / total_rows > 0.2) {
    warning("WARNING: More than 20% of projects missing coordinates")
  }
  
  # Check coordinate ranges for US projects
  us_lat_range <- df$latitude >= 24 & df$latitude <= 72  # Continental US + Alaska + Hawaii
  us_lon_range <- df$longitude >= -180 & df$longitude <= -65 # US longitude range
  
  outside_us <- sum(!(us_lat_range & us_lon_range), na.rm = TRUE)
  if (outside_us > 0) {
    message(glue("Projects outside typical US coordinates: {outside_us}"))
  }
  
  return(df)
}

# Validate panel structure
validate_panel_structure <- function(df) {
  message("=== Panel Structure Validation ===")
  
  # Check for duplicates
  duplicates <- df %>%
    group_by(id, date) %>%
    summarise(n = n(), .groups = "drop") %>%
    filter(n > 1)
  
  if (nrow(duplicates) > 0) {
    warning(glue("WARNING: {nrow(duplicates)} plant-date combinations have duplicates"))
  }
  
  # Check panel balance
  plants <- length(unique(df$id))
  dates <- length(unique(df$date))
  expected_obs <- plants * dates
  actual_obs <- nrow(df)
  
  message(glue("Panel dimensions: {plants} plants × {dates} dates"))
  message(glue("Expected observations: {expected_obs}"))
  message(glue("Actual observations: {actual_obs}"))
  message(glue("Panel completeness: {round(100*actual_obs/expected_obs, 1)}%"))
  
  return(df)
}

# Validate technology filtering and capacity data
validate_technology_capacity <- function(df) {
  message("=== Technology & Capacity Validation ===")
  
  # Check technology distribution
  tech_summary <- df %>%
    group_by(technology) %>%
    summarise(
      n_projects = n_distinct(id),
      total_capacity = sum(cap, na.rm = TRUE),
      avg_capacity = mean(cap, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    arrange(desc(total_capacity))
  
  message("Technology summary:")
  print(tech_summary)
  
  # Check for suspicious capacity values
  capacity_issues <- df %>%
    group_by(technology) %>%
    summarise(
      zero_capacity = sum(cap == 0, na.rm = TRUE),
      negative_capacity = sum(cap < 0, na.rm = TRUE),
      missing_capacity = sum(is.na(cap)),
      very_large = sum(cap > 1000, na.rm = TRUE), # >1GW projects
      .groups = "drop"
    )
  
  message("Capacity issues by technology:")
  print(capacity_issues)
  
  if (any(capacity_issues$negative_capacity > 0)) {
    warning("WARNING: Negative capacity values detected")
  }
  
  return(df)
}

# Validate final output
validate_final_output <- function(df) {
  message("=== Final Output Validation ===")
  
  # Check completeness
  message(glue("Final dataset: {nrow(df)} observations"))
  message(glue("Plants: {length(unique(df$id))}"))
  message(glue("Date range: {min(df$date)} to {max(df$date)}"))
  
  # Check months_post_cont variable
  months_summary <- summary(df$months_post_cont)
  message("Months post-construction summary:")
  print(months_summary)
  
  if (max(df$months_post_cont, na.rm = TRUE) > 50) {
    warning("WARNING: Some projects have >50 months post-construction")
  }
  
  # Check for missing values in key variables
  missing_check <- df %>%
    summarise(
      missing_cap = sum(is.na(cap)),
      missing_coords = sum(is.na(lat) | is.na(lon)),
      missing_technology = sum(is.na(technology)),
      missing_status = sum(is.na(status))
    )
  
  if (any(missing_check > 0)) {
    message("Missing values in key variables:")
    print(missing_check)
  }
  
  return(df)
}

bind_eia <- function(directory_path = here("data", "input", "EIA-860M"), 
                     ignore_year = NULL, 
                     focus_year = NULL, 
                     sheet_name = NULL) {
  
    files <- list.files(directory_path, pattern = "\\.xlsx$", full.names = TRUE)
  
    if (!is.null(ignore_year)) {
    files_to_process <- files[!grepl(paste0("^", ignore_year), basename(files))]
  } else if (!is.null(focus_year)) {
    files_to_process <- files[grepl(paste0("^", focus_year), basename(files))]
  } else {
    files_to_process <- files
  }
  
  # Progress setup
  total_files <- length(files_to_process)
  message(glue("Loading {total_files} EIA files from sheet '{sheet_name}'..."))
  
  all_data <- files_to_process |>
    imap_dfr(function(file_path, file_index) {
      file_name <- basename(file_path)
      year <- substr(file_name, 1, 4)
      month_no <- substr(file_name, 6, 7)
      
      # Progress indicator - show every 10th file or first/last
      if (file_index == 1 || file_index == total_files || file_index %% 10 == 0) {
        message(glue("  Processing file {file_index}/{total_files}: {year}-{month_no}"))
      }
      
      eia_result <- read_eia(year = as.numeric(year), month_no = month_no)
      
      sheet_data <- eia_result[[sheet_name]]
      
      sheet_data$year <- year
      sheet_data$month <- month_no
      
      # Clean names for each file
      sheet_data <- clean_names(sheet_data)
      
      return(sheet_data)
    })
  
  message(glue("[OK] Successfully loaded {total_files} files with {nrow(all_data)} total observations"))
  return(all_data)
}

eia_planned <- bind_eia(sheet_name = "Planned")
# Remove rows with all NA
eia_planned <- eia_planned %>%
  filter(!is.na(entity_id) & !grepl("notes", entity_id, ignore.case = TRUE))
eia_planned <- validate_input_data(eia_planned)
# Create unique ID
eia_planned <- eia_planned %>%
  mutate(id = paste(plant_id, generator_id, sep = "_"))

g <- eia_planned %>%
  mutate(across(c(net_summer_capacity_mw, latitude, longitude, nameplate_capacity_mw:net_winter_capacity_mw), ~ as.numeric(.x)))

# Create a dataframe: plant x month-year x status x type
# and then expand that data for after a plant was constructed
g$date <- with(g, ym(paste(year, month, sep = "-")))
g <- g %>%
  dplyr::select(id, date, net_summer_capacity_mw, latitude, longitude, technology, status)
# simplify categories into whether ground has been broken or not on construction
g <- g %>%
  mutate(
    status = case_when(
      status == "(L) Regulatory approvals pending. Not under construction" ~ "pre-construction",
      status == "(OA) Out of service but expected to return to service in next calendar year"~ "Other",
      status == "(OP) Operating" ~ "construction",
      status == "(OT) Other" ~ "Other",
      status == "(P) Planned for installation, but regulatory approvals not initiated" ~ "pre-construction",
      status == "(T) Regulatory approvals received. Not under construction" ~ "pre-construction",
      status == "(TS) Construction complete, but not yet in commercial operation" ~ "construction",
      status == "(U) Under construction, less than or equal to 50 percent complete" ~ "construction",
      status == "(V) Under construction, more than 50 percent complete" ~ "construction"
    )
  )
# Drop other status
g_before_filter <- g
g <- g %>%
  filter(status != "Other")
g <- validate_status_processing(g_before_filter, g)

# Save complete dataset
saveRDS(g, file = here("data", "inter", "eia_processed.rds"))

# Subset to primary dataset for analysis
g <- g %>%
  filter(date >= ym("2022-08"))
g <- validate_geographic_data(g)

# Create average location or capacity to account for how information could be updated in subsequent reports
g <- g %>%
  group_by(id) %>%
  mutate(
    net_summer_capacity_mw = mean(net_summer_capacity_mw, na.rm = TRUE),
    latitude = mean(latitude, na.rm = TRUE),
    longitude = mean(longitude, na.rm = TRUE)
  ) %>%
  rename(
    cap = net_summer_capacity_mw,
    lat = latitude,
    lon = longitude
  )

# Trim the few observations missing geo-codes
g <- g %>%
  filter(!is.na(lon) & !is.na(lat))

# Get names for panel dataset
plant_id <- unique(g$id)
date_ym <- unique(g$date)
full.df <- expand.grid(plant_id, date_ym)
names(full.df) <- c("id", "date")

g.out <- left_join(full.df, g, by = c("id", "date"))

g.out <- g.out %>%
  arrange(id, date) %>%
  group_by(id) %>%
  tidyr::fill(c(cap, lat, lon, status, technology), .direction = "down") %>%
  ungroup() %>%
  filter(!is.na(status))

g.out <- distinct(g.out)
g.out <- validate_panel_structure(g.out)

g.out <- g.out %>%
  group_by(id) %>%
  mutate(months_post_cont = cumsum(status == "construction"))

g.out <- g.out %>%
  filter(
    technology %in% c(
      "Batteries", "Natural Gas Fired Combined Cycle", "Natural Gas Fired Combustion Turbine",
      "Natural Gas Internal Combustion Engine", "Natural Gas Steam Turbine", "Natural Gas with Compressed Air Storage",
      "Offshore Wind Turbine", "Onshore Wind Turbine", "Solar Photovoltaic"
    )
  ) %>%
  mutate(
    technology = case_when(
      grepl("Natural Gas", technology) ~ "Gas",
      T ~ technology
      )
  )
g.out <- validate_technology_capacity(g.out)
g.out <- validate_final_output(g.out)

# Create measure of median project capacity
g.out <- g.out %>%
  group_by(technology) %>%
  mutate(
    above_median_cap = as.integer(cap > median(cap, na.rm = TRUE)),
    above_1sd_cap = as.integer(cap > mean(cap, na.rm = TRUE) + sd(cap, na.rm = TRUE))
  ) %>%
  ungroup()

saveRDS(g.out, file = here("data", "inter", "eia_survey_processed.rds"))
message("[OK] EIA data processed and validated successfully")